R will run in parallel. A number of packages help with this. Some require “forking” which Windows will not support.
p_load(parallel)
p_load(MASS)
starts <- rep(100,4)
fx <- function(nstart) kmeans(Boston, 4, nstart=nstart)
numCores <- detectCores()
numCores
## [1] 16
microbenchmark(lapply(starts, fx))
## Unit: milliseconds
## expr min lq mean median uq max neval
## lapply(starts, fx) 61.9797 68.4986 72.4378 72.72365 76.56555 87.6348 100
microbenchmark(lapply(starts, fx), mclapply(starts, fx, mc.cores=numCores))
## Error in mclapply(starts, fx, mc.cores = numCores): 'mc.cores' > 1 is not supported on Windows
microbenchmark(results1 <- lapply(starts, fx),
results2 <- mclapply(starts, fx, mc.cores=numCores)
)
## Error in mclapply(starts, fx, mc.cores = numCores): 'mc.cores' > 1 is not supported on Windows
microbenchmark(lapply(starts, fx), mclapply(starts, fx))
## Unit: milliseconds
## expr min lq mean median uq max
## lapply(starts, fx) 60.9104 64.69205 68.09924 65.56640 67.60510 152.7054
## mclapply(starts, fx) 61.7964 64.67095 66.80696 65.71785 68.90565 76.5378
## neval cld
## 100 a
## 100 a
microbenchmark(results1 <- lapply(starts, fx),
results2 <- mclapply(starts, fx)
)
## Unit: milliseconds
## expr min lq mean median uq
## results1 <- lapply(starts, fx) 61.8234 64.20815 65.32908 65.1188 65.9042
## results2 <- mclapply(starts, fx) 59.7178 64.34565 66.41706 65.0911 66.1897
## max neval cld
## 74.3293 100 a
## 144.0106 100 a
We can use the doParallel package with foreach to improve looping.
First some setup.
p_load(foreach)
p_load(doParallel)
registerDoParallel(2)
getDoParWorkers()
## [1] 2
getDoParName()
## [1] "doParallelSNOW"
getDoParVersion()
## [1] "1.0.17"
Now carry out some simple calculations to show how to use parallel processing.
microbenchmark(
for (i in 1:10){
sqrt(i)
},
foreach(i=1:10) %do% {
sqrt(i)
},
foreach (i=1:10) %dopar% {
sqrt(i)
},
foreach (i=1:10, .combine=c) %dopar% {
sqrt(i)
}
)
## Unit: microseconds
## expr min lq
## for (i in 1:10) { sqrt(i) } 831.6 916.50
## foreach(i = 1:10) %do% { sqrt(i) } 1882.1 1983.10
## foreach(i = 1:10) %dopar% { sqrt(i) } 3087.3 3468.10
## foreach(i = 1:10, .combine = c) %dopar% { sqrt(i) } 3146.5 3489.35
## mean median uq max neval cld
## 989.478 949.75 1010.45 1545.7 100 a
## 2166.570 2036.55 2210.00 4726.9 100 b
## 3843.430 3629.45 4010.95 8234.5 100 c
## 3997.699 3666.30 4028.05 22876.0 100 c
Note that the use of parallel processing in this case is not helping.
Try some bootstrapping to check performance on a “real” problem.
x <- iris[which(iris[,5] != "setosa"), c(1,5)]
trials <- 10000
microbenchmark(
### Parallel (%dopar%)
bs1 <- foreach(icount(trials), .combine=cbind) %dopar% {
ind <- sample(100, 100, replace=TRUE)
result1 <- glm(x[ind,2]~x[ind,1], family=binomial(logit))
coefficients(result1)
},
### Not parallel (%do%)
bs2 <- foreach(icount(trials), .combine=cbind) %do% {
ind <- sample(100, 100, replace=TRUE)
result1 <- glm(x[ind,2]~x[ind,1], family=binomial(logit))
coefficients(result1)
}
)
## Unit: seconds
## expr
## bs1 <- foreach(icount(trials), .combine = cbind) %dopar% { ind <- sample(100, 100, replace = TRUE) result1 <- glm(x[ind, 2] ~ x[ind, 1], family = binomial(logit)) coefficients(result1) }
## bs2 <- foreach(icount(trials), .combine = cbind) %do% { ind <- sample(100, 100, replace = TRUE) result1 <- glm(x[ind, 2] ~ x[ind, 1], family = binomial(logit)) coefficients(result1) }
## min lq mean median uq max neval cld
## 8.597918 8.713557 8.835058 8.757424 8.937364 10.64344 100 a
## 11.985286 12.108203 12.316349 12.264483 12.420281 14.46708 100 b
### Look at the bootstrap since we ran it
bs1 <- t(bs1)
bs2 <- t(bs2)
apply(bs1,2,mean)
## (Intercept) x[ind, 1]
## -13.137360 2.104112
apply(bs1,2,var)
## (Intercept) x[ind, 1]
## 9.6451353 0.2473618
apply(bs1,2,hist)
## $`(Intercept)`
## $breaks
## [1] -32 -30 -28 -26 -24 -22 -20 -18 -16 -14 -12 -10 -8 -6 -4 -2
##
## $counts
## [1] 1 3 10 22 51 159 428 976 1932 2602 2339 1171 283 22 1
##
## $density
## [1] 0.00005 0.00015 0.00050 0.00110 0.00255 0.00795 0.02140 0.04880 0.09660
## [10] 0.13010 0.11695 0.05855 0.01415 0.00110 0.00005
##
## $mids
## [1] -31 -29 -27 -25 -23 -21 -19 -17 -15 -13 -11 -9 -7 -5 -3
##
## $xname
## [1] "newX[, i]"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
##
## $`x[ind, 1]`
## $breaks
## [1] 0.5 1.0 1.5 2.0 2.5 3.0 3.5 4.0 4.5 5.0 5.5
##
## $counts
## [1] 28 907 3570 3532 1489 377 74 19 3 1
##
## $density
## [1] 0.0056 0.1814 0.7140 0.7064 0.2978 0.0754 0.0148 0.0038 0.0006 0.0002
##
## $mids
## [1] 0.75 1.25 1.75 2.25 2.75 3.25 3.75 4.25 4.75 5.25
##
## $xname
## [1] "newX[, i]"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
apply(bs2,2,mean)
## (Intercept) x[ind, 1]
## -13.166303 2.109399
apply(bs2,2,var)
## (Intercept) x[ind, 1]
## 9.9107502 0.2536903
apply(bs2,2,hist)
## $`(Intercept)`
## $breaks
## [1] -34 -32 -30 -28 -26 -24 -22 -20 -18 -16 -14 -12 -10 -8 -6 -4 -2
##
## $counts
## [1] 1 1 1 3 15 72 188 454 970 1934 2549 2311 1212 257 31
## [16] 1
##
## $density
## [1] 0.00005 0.00005 0.00005 0.00015 0.00075 0.00360 0.00940 0.02270 0.04850
## [10] 0.09670 0.12745 0.11555 0.06060 0.01285 0.00155 0.00005
##
## $mids
## [1] -33 -31 -29 -27 -25 -23 -21 -19 -17 -15 -13 -11 -9 -7 -5 -3
##
## $xname
## [1] "newX[, i]"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
##
## $`x[ind, 1]`
## $breaks
## [1] 0.5 1.0 1.5 2.0 2.5 3.0 3.5 4.0 4.5 5.0 5.5
##
## $counts
## [1] 44 901 3539 3510 1482 420 89 12 1 2
##
## $density
## [1] 0.0088 0.1802 0.7078 0.7020 0.2964 0.0840 0.0178 0.0024 0.0002 0.0004
##
## $mids
## [1] 0.75 1.25 1.75 2.25 2.75 3.25 3.75 4.25 4.75 5.25
##
## $xname
## [1] "newX[, i]"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"